R Markdown Plot email Exploring variables

getwd()
## [1] "/Users/pankaj/dev/git/smu/qtw/case_study3"
setwd("/Users/pankaj/dev/git/smu/qtw/case_study3/Week_5_Materials_2")
load("data.Rda") 

length(emailDFrp)
## [1] 30
plot(colSums(is.na(emailDFrp)))

df1= emailDFrp

df1$id = seq(1, length(emailDFrp$isRe))


df1 %>% 
  ggplot(mapping = aes(x= isRe, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= underscore, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= priority, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isInReplyTo, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= sortedRec, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= subPunc, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= multipartText, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= hasImages, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isPGPsigned, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= subSpamWords, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= noHost, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= numEnd, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isYelling, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isOrigMsg, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isDear, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isWrote, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= subExcCt, fill = isSpam))+
  geom_bar()
## Warning: Removed 20 rows containing non-finite values (stat_count).

df1 %>% 
  ggplot(mapping = aes(x= subQuesCt, fill = isSpam))+
  geom_bar()
## Warning: Removed 20 rows containing non-finite values (stat_count).

df1 %>% 
  ggplot(mapping = aes(x= numLines, fill = isSpam))+
  geom_bar(binwidth = 100)
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.

df1 %>% 
  ggplot(mapping = aes(x= numDlr, fill = isSpam))+
  geom_bar(binwidth = 100)
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.

df1 %>% 
  ggplot(mapping = aes(x= numAtt, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= numRec, fill = isSpam))+
  geom_bar(binwidth = 100)
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.
## Warning: Removed 282 rows containing non-finite values (stat_bin).

df1 %>% 
  ggplot(mapping = aes(x= hour, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= perHTML, fill = isSpam))+
  geom_bar()
## Warning: position_stack requires non-overlapping x intervals

df1 %>% 
  ggplot(mapping = aes(x= forwards ,    color = isSpam))+
  geom_freqpoly( binwidth= 1)

df1 %>% 
  ggplot(mapping = aes(x= perCaps ,    color = isSpam))+
  geom_freqpoly(binwidth= 1 )

df1 %>% 
  ggplot(mapping = aes(x= bodyCharCt ,    color = isSpam))+
  geom_freqpoly( binwidth= 100)

df1 %>% 
  ggplot(mapping = aes(x= avgWordLen ,    color = isSpam))+
  geom_freqpoly(binwidth= 1 )

df1 %>% 
  ggplot(mapping = aes(x= subBlanks ,    color = isSpam))+
  geom_freqpoly( binwidth= 1)
## Warning: Removed 20 rows containing non-finite values (stat_bin).

Tree Model

## Confusion Matrix and Statistics
## 
##    pred.rpint1
##        F    T
##   F 3489    3
##   T 1122   60
##                                           
##                Accuracy : 0.7593          
##                  95% CI : (0.7468, 0.7715)
##     No Information Rate : 0.9865          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0727          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.75667         
##             Specificity : 0.95238         
##          Pos Pred Value : 0.99914         
##          Neg Pred Value : 0.05076         
##              Prevalence : 0.98652         
##          Detection Rate : 0.74647         
##    Detection Prevalence : 0.74711         
##       Balanced Accuracy : 0.85452         
##                                           
##        'Positive' Class : F               
## 

Grid search for tree model

train.control <- trainControl(
  method = "repeatedcv",
  number = 10, ## 10-fold CV
  repeats = 3,## repeated three times
  # USE AUC
  summaryFunction = twoClassSummary, 
  classProbs = TRUE
)

tune.gridcart <- expand.grid(maxdepth = 2:15)

#tune.gridcart <- expand.grid(cp = seq(2,10))


system.time (rpartFit2 <- train(isSpam ~ . ,                         
                                data = emailDFrp,
                                method = "rpart2", 
                                tuneGrid =tune.gridcart,
                                trControl = train.control,
                                metric = "ROC", na.action = na.pass
))
##    user  system elapsed 
##  31.103   1.376  13.555
rpartFit2 
## CART 
## 
## 9348 samples
##   29 predictor
##    2 classes: 'F', 'T' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 8413, 8414, 8414, 8413, 8414, 8412, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  ROC        Sens       Spec     
##    2        0.7585786  0.9246642  0.5878144
##    3        0.7761748  0.9501753  0.5849152
##    4        0.7789992  0.9683019  0.5762930
##    5        0.8001320  0.9705070  0.5967329
##    6        0.9161483  0.9513262  0.7352371
##    7        0.9161483  0.9513262  0.7352371
##    8        0.9164148  0.9510385  0.7366318
##    9        0.9177516  0.9530529  0.7377586
##   10        0.9187233  0.9554506  0.7377586
##   11        0.9255864  0.9471549  0.7707078
##   12        0.9289536  0.9450444  0.7860094
##   13        0.9348707  0.9428388  0.8093579
##   14        0.9363542  0.9450928  0.8128382
##   15        0.9371638  0.9479696  0.8149262
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 15.
plot(rpartFit2)

From above grid search it looks like ROC curve area increases with number of depth but the increase is not much after max depth 6. so that one we ll choose for our final model

Tree Model with all variables

emailDFrp.rp <-rpart(isSpam ~ . ,                         # formula
                     data = emailDFrp,                       # data frame
                     subset = train,                       # indices of training set
                     method = "class",                     # classification tree
                     parms = list(split = "information"),  # use entropy/deviance
                     maxsurrogate = 0,                     # since no missing values
                     cp = 0,                               # no size penalty
                     minsplit = 5,                         # Nodes of size 5 (or # morecan be split,
                     minbucket = 2,
                     # provided each sub-node
                     # contains at least 2 obs.
                     control=rpart.control(maxdepth=6)
)


#summary(emailDFrp.rp)

plot(emailDFrp.rp, 
     uniform=TRUE,
     compress=TRUE,
     margin = .2)
text(emailDFrp.rp, 
     use.n=TRUE, 
     all = TRUE,
     fancy = TRUE)

fancyRpartPlot(emailDFrp.rp, main="Decision Tree Graph")

Evaluation of Model :

confusion matrix for model based on all vars

pred.rpint <- predict(emailDFrp.rp,
                       newdata = emailDFrp[-train,],
                       type = "class")


#
#  Classification table on the test data
#

cm = table(emailDFrp$isSpam[-train], pred.rpint)

confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    pred.rpint
##        F    T
##   F 3387  105
##   T  490  692
##                                           
##                Accuracy : 0.8727          
##                  95% CI : (0.8628, 0.8821)
##     No Information Rate : 0.8295          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6224          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8736          
##             Specificity : 0.8683          
##          Pos Pred Value : 0.9699          
##          Neg Pred Value : 0.5854          
##              Prevalence : 0.8295          
##          Detection Rate : 0.7246          
##    Detection Prevalence : 0.7471          
##       Balanced Accuracy : 0.8709          
##                                           
##        'Positive' Class : F               
## 

H2O random forest model

# Build and train the model:

h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 days 6 hours 
##     H2O cluster timezone:       Asia/Singapore 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    2 months and 13 days  
##     H2O cluster name:           H2O_started_from_R_pankaj_qto124 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.26 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.0 (2019-04-26)
emailDHex = as.h2o(emailDFrp)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
splits = h2o.splitFrame(data = emailDHex, ratios = .8, seed = 1234)
trainHex = splits[[1]]
validHex = splits[[2]]
predictors = c("isRe")
predictors = names(emailDFrp)[c(seq(2,29))]
response = c("isSpam")


emailD_2tree <- h2o.randomForest(x = predictors,
                                  y = response,
                                  ntrees = 5,
                                  max_depth = 5,
                                  min_rows = 20,
                                  binomial_double_trees = TRUE,
                                  training_frame = trainHex,
                                  validation_frame = validHex)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=================================================================| 100%
emailDH2oTree2 = h2o.getModelTree(model = emailD_2tree, tree_number = 2)

library(data.tree)


library(DiagrammeR)

emailDataTree = createDataTree(emailDH2oTree2)

GetEdgeLabel <- function(node) {return (node$edgeLabel)}
GetNodeShape <- function(node) {switch(node$type, 
                                       split = "diamond", leaf = "oval")}
GetFontName <- function(node) {switch(node$type, 
                                      split = 'Palatino-bold', 
                                      leaf = 'Palatino')}
SetEdgeStyle(emailDataTree, fontname = 'Palatino-italic', 
             label = GetEdgeLabel, labelfloat = TRUE,
             fontsize = "26", fontcolor='royalblue4')
SetNodeStyle(emailDataTree, fontname = GetFontName, shape = GetNodeShape, 
             fontsize = "26", fontcolor='royalblue4',
             height="0.75", width="1")

SetGraphStyle(emailDataTree, rankdir = "LR", dpi=70.)

plot(emailDataTree, output = "graph")
emailDataTree
##                                     levelName
## 1  bodyCharCt                                
## 2   ¦--isRe                                  
## 3   ¦   ¦--numEnd                            
## 4   ¦   ¦   ¦--sortedRec                     
## 5   ¦   ¦   ¦   ¦--prediction: 0             
## 6   ¦   ¦   ¦   °--perCaps                   
## 7   ¦   ¦   ¦       ¦--prediction: 0.5       
## 8   ¦   ¦   ¦       °--prediction: 0.07070707
## 9   ¦   ¦   °--perHTML                       
## 10  ¦   ¦       ¦--hour                      
## 11  ¦   ¦       ¦   ¦--prediction: 0.89816934
## 12  ¦   ¦       ¦   °--prediction: 0.733871  
## 13  ¦   ¦       °--prediction: 0             
## 14  ¦   °--isInReplyTo                       
## 15  ¦       ¦--subBlanks                     
## 16  ¦       ¦   ¦--hour (L)                  
## 17  ¦       ¦   ¦   ¦--prediction: 0.95348835
## 18  ¦       ¦   ¦   °--prediction: 1         
## 19  ¦       ¦   °--hour (R)                  
## 20  ¦       ¦       ¦--prediction: 1         
## 21  ¦       ¦       °--prediction: 0.95      
## 22  ¦       °--prediction: 1                 
## 23  °--numRec                                
## 24      ¦--isInReplyTo                       
## 25      ¦   ¦--perCaps                       
## 26      ¦   ¦   ¦--numRec                    
## 27      ¦   ¦   ¦   ¦--prediction: 0.6415094 
## 28      ¦   ¦   ¦   °--prediction: 0.8785047 
## 29      ¦   ¦   °--bodyCharCt                
## 30      ¦   ¦       ¦--prediction: 0.11678832
## 31      ¦   ¦       °--prediction: 0.82857144
## 32      ¦   °--prediction: 1                 
## 33      °--prediction: 0
h2o.shutdown()
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?